CGC clustering¶

Install some packages¶

pip install matplotlib¶
pip install pandas¶
pip install joblib¶
pip install sklearn¶
pip install plotly¶

If you meet an error, please add "--user" at the end. For example, try "pip install sklearn --user".¶

In your terminal, type "python" to start the python.¶

Load Libraries¶

In [1]:
# for dealing with file directories
import os

# for making the plots
import matplotlib.pyplot as plt

# for dealing with dataframes
import pandas as pd

# for parallel computation
from joblib import Parallel, delayed

# for k mean clustering
from sklearn.cluster import KMeans

# for converting text sequences to features
from sklearn.feature_extraction.text import CountVectorizer

1. Prepare Data¶

In [2]:
# Set the your working directory
# os.chdir(r"/work/$your_group/$your_name/workshop_2022/CGC_clustering")
# example:
os.chdir(r"/work/yinlab/tangli/workshop_2022/CGC_clustering")
In [3]:
# Read dataframe
all_unsupervised = pd.read_csv("cgc_seq_substr_list.txt", sep="\t", header=None)
In [4]:
# Name columns for the dataframe
all_unsupervised.columns = [
    "Genome_ID",
    "CGC_ID",
    "sig_gene_seq",
    "low_level_substr",
    "Species",
]
In [5]:
all_unsupervised.head()
Out[5]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375
4 MGYG000000013 MGYG000000013_1|CGC5 GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... lactose,host glycan Bacteroides sp902362375
In [6]:
# View the shape of dataframe
all_unsupervised.shape
Out[6]:
(3352, 5)
In [7]:
# There are a few duplicates in the dataframe
all_unsupervised["sig_gene_seq"].value_counts()
Out[7]:
9.B.146,2.A.103,GT28                               24
GT28,2.A.103,9.B.146                               18
CBM50,3.A.5                                        17
1.B.14,8.A.46,GH18                                 16
3.A.5,CBM50                                        15
                                                   ..
2.A.66,GH16_3                                       1
GH95,1.B.14,GH2,GH35,GH43_10,1.B.14                 1
1.B.42,Aminotran_1_2,CBM67|GH78                     1
3.A.3,3.A.3,3.A.3,2.A.21,GerE,FecR,1.B.14,GH115     1
1.A.43,GH3,GH158,GH16_3,1.B.14                      1
Name: sig_gene_seq, Length: 2463, dtype: int64
In [8]:
# Show an example for duplicates
all_unsupervised[all_unsupervised["sig_gene_seq"] == "9.B.146,2.A.103,GT28"]
Out[8]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
10 MGYG000000013 MGYG000000013_1|CGC11 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp902362375
266 MGYG000000057 MGYG000000057_3|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp002491635
396 MGYG000000105 MGYG000000105_3|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides clarus
669 MGYG000000224 MGYG000000224_6|CGC5 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp003545565
707 MGYG000000236 MGYG000000236_3|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides fragilis_A
888 MGYG000000652 MGYG000000652_11|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides togonis
903 MGYG000000675 MGYG000000675_1|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides congonensis
1288 MGYG000001345 MGYG000001345_36|CGC11 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides xylanisolvens
1537 MGYG000001378 MGYG000001378_14|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides ovatus
1586 MGYG000001422 MGYG000001422_2|CGC10 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides oleiciplenus
1701 MGYG000001433 MGYG000001433_1|CGC39 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides salyersiae
1759 MGYG000001461 MGYG000001461_1|CGC10 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides neonati
1826 MGYG000001661 MGYG000001661_1|CGC8 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides gallinarum
1927 MGYG000002273 MGYG000002273_9|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides stercorirosoris
2392 MGYG000002549 MGYG000002549_18|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides caccae
2566 MGYG000003064 MGYG000003064_3|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides graminisolvens
2714 MGYG000003252 MGYG000003252_545|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900761785
2739 MGYG000003312 MGYG000003312_4|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900547205
2922 MGYG000003363 MGYG000003363_45|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900766195
3004 MGYG000003367 MGYG000003367_48|CGC5 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp007896885
3157 MGYG000004019 MGYG000004019_6|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides massiliensis_A
3195 MGYG000004185 MGYG000004185_1|CGC6 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900553815
3240 MGYG000004676 MGYG000004676_2|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900766005
3310 MGYG000004899 MGYG000004899_9|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900555635
In [9]:
# Remove duplicates
all_unsupervised = all_unsupervised.drop_duplicates("sig_gene_seq")
all_unsupervised.shape
Out[9]:
(2463, 5)
In [10]:
# Write out a csv file
all_unsupervised.to_csv("unsupervised_cgc_data.tsv", index=False, sep="\t")
In [11]:
all_unsupervised.head()
Out[11]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375
4 MGYG000000013 MGYG000000013_1|CGC5 GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... lactose,host glycan Bacteroides sp902362375
In [12]:
# Some low-level subratrates have multiple categories in one string
# removing those
all_unsupervised["lens"] = [
    len(str(seq).split(",")) for seq in all_unsupervised["low_level_substr"]
]
In [13]:
# Keep only the clean ones (one subratrate)
all_unsupervised = all_unsupervised[all_unsupervised["lens"] == 1]
all_unsupervised
Out[13]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
... ... ... ... ... ... ...
3340 MGYG000004899 MGYG000004899_23|CGC1 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... xylan Bacteroides sp900555635 1
3342 MGYG000004899 MGYG000004899_28|CGC1 GT0,GT4,9.B.18,GT0 capsule polysaccharide Bacteroides sp900555635 1
3343 MGYG000004899 MGYG000004899_30|CGC1 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... beta-glucan Bacteroides sp900555635 1
3344 MGYG000004899 MGYG000004899_31|CGC1 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan Bacteroides sp900555635 1
3345 MGYG000004899 MGYG000004899_32|CGC1 PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... rhamnogalacturonan Bacteroides sp900555635 1

1931 rows × 6 columns

In [14]:
# all_unsupervised = pd.concat([all_unsupervised_clean], ignore_index = True)
In [15]:
# Count the number of cgc sequences for each subratrate
all_unsupervised["low_level_substr"].value_counts()
Out[15]:
host glycan               346
capsule polysaccharide    201
rhamnogalacturonan        122
glycosaminoglycan         120
pectin                    108
                         ... 
fucosyllactose              1
cellobiose                  1
trehalose                   1
sorbitol                    1
beta-glucoside              1
Name: low_level_substr, Length: 64, dtype: int64
In [16]:
all_unsupervised["low_level_substr"]
Out[16]:
0                galactomannan
1                 alpha-mannan
2                  host glycan
3                       starch
5                 arabinoxylan
                 ...          
3340                     xylan
3342    capsule polysaccharide
3343               beta-glucan
3344               host glycan
3345        rhamnogalacturonan
Name: low_level_substr, Length: 1931, dtype: object
In [17]:
# Need to decide on delimeters
all_unsupervised[["sig_gene_seq"]].values[-1][0]
Out[17]:
'PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67|GH78,HTH_AraC,GH106'
In [18]:
# Treat "|" as ","
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",")
Out[18]:
'PL8_3,CBM67,GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67,GH78,HTH_AraC,GH106'
In [19]:
# Split the cgc sequences
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",").split(",")
Out[19]:
['PL8_3',
 'CBM67',
 'GH78',
 'GH28',
 'GH88',
 'GH2',
 'PL8_3',
 '8.A.46',
 '1.B.14',
 'CBM67',
 'GH78',
 'HTH_AraC',
 'GH106']
In [20]:
all_unsupervised.head()
Out[20]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
In [21]:
# check if there is missing value
all_unsupervised.isnull().sum()
# Drop missing values if possible
# all_unsupervised = all_unsupervised.dropna()
Out[21]:
Genome_ID           0
CGC_ID              0
sig_gene_seq        0
low_level_substr    0
Species             0
lens                0
dtype: int64
In [22]:
# Only keep subratrate classes which have more than 2 occurrences
count_df = pd.DataFrame(
    all_unsupervised["low_level_substr"].value_counts()[
        all_unsupervised["low_level_substr"].value_counts() >= 2
    ]
).reset_index()
In [23]:
# Get their names
to_keep_substrates = count_df["index"].values
to_keep_substrates
Out[23]:
array(['host glycan', 'capsule polysaccharide', 'rhamnogalacturonan',
       'glycosaminoglycan', 'pectin', 'alpha-mannan', 'N-glycan',
       'homogalacturonan', 'xylan', 'arabinoxylan', 'porphyran',
       'hemicellulose', 'mucin', 'acetylated glucuronoxylan',
       'beta-glucan', ' ', 'plant polysaccharide', 'xyloglucan',
       'arabinogalactan', 'fructan', 'O-antigen', 'galactomannan',
       'exopolysaccharide', 'sialoglycoconjugate', 'dextran', 'starch',
       'arabinan', 'cellulose', 'glycogen', 'carrageenan', 'alginate',
       'maltose', 'beta-mannan', 'O-glycan', 'pullulan', 'mannose',
       'alpha-glucan', 'galactan', 'sialic acid', 'ribose', 'chitin',
       'xanthan', 'arabinose', 'maltooligosaccharide',
       '4-methylumbelliferyl 6-azido-6-deoxy-beta-D-galactoside',
       'acarbose', 'glucomannan', 'glucose'], dtype=object)
In [24]:
# Subset to only those that are more than 2
all_unsupervised = all_unsupervised[
    all_unsupervised["low_level_substr"].isin(to_keep_substrates)
]
In [25]:
all_unsupervised
Out[25]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
... ... ... ... ... ... ...
3340 MGYG000004899 MGYG000004899_23|CGC1 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... xylan Bacteroides sp900555635 1
3342 MGYG000004899 MGYG000004899_28|CGC1 GT0,GT4,9.B.18,GT0 capsule polysaccharide Bacteroides sp900555635 1
3343 MGYG000004899 MGYG000004899_30|CGC1 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... beta-glucan Bacteroides sp900555635 1
3344 MGYG000004899 MGYG000004899_31|CGC1 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan Bacteroides sp900555635 1
3345 MGYG000004899 MGYG000004899_32|CGC1 PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... rhamnogalacturonan Bacteroides sp900555635 1

1915 rows × 6 columns

In [26]:
# View all cgc sequences in the subset
all_unsupervised["sig_gene_seq"]
Out[26]:
0                                   1.B.14,GH29,GH16,GH76
1                    1.B.14,HTH_AraC,GH92,GH76,GH130,GH92
2                                      GH18,8.A.46,1.B.14
3                                        GH13,GH97,1.B.14
5       2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_...
                              ...                        
3340    3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_...
3342                                   GT0,GT4,9.B.18,GT0
3343    2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2...
3344    1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8....
3345    PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1....
Name: sig_gene_seq, Length: 1915, dtype: object
In [27]:
# Write selected out a csv file
all_unsupervised.to_csv("unsupervised_cgc_selected.tsv", index=False, sep="\t")

2. Train and Test Subsets¶

In [28]:
# Train test split - Split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split
In [29]:
# Train test split
# Split original data into 70% for clustering and 30% for finding the mappings
X_train, X_test, y_train, y_test = train_test_split(
    all_unsupervised["sig_gene_seq"],
    all_unsupervised["low_level_substr"],
    test_size=0.3,
    stratify=all_unsupervised["low_level_substr"].values,
)

## X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=array-like)
## test_size: the proportion of the dataset to include in the train split
## stratify: If not None, data is split in a stratified fashion, using this as the class labels.
In [30]:
# Supervised is for finding the mappings
X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
X_supervised
/tmp/ipykernel_3088779/4208520920.py:2: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
  X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
Out[30]:
sig_gene_seq low_level_substr
0 GH2,GH84,2.A.37 rhamnogalacturonan
1 GH97,GH92,8.A.46,1.B.14 mucin
2 3.A.1,HTH_AraC,2.A.2,GH43_1,GH67 xylan
3 GT2,1.B.14,1.B.14,GH92,GH28,GH92,GH89 alpha-mannan
4 2.A.21,3.A.10,GH63,GH2,2.A.124,2.A.124,1.B.14 rhamnogalacturonan
... ... ...
570 2.A.114,GH141,PL12_2,1.B.14,1.B.14,GH29,HTH_Ar... host glycan
571 2.A.13,3.A.11,GH123
572 2.A.81,1.B.14,2.A.4,GH20 N-glycan
573 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan
574 GH84,GH97,1.B.42,3.A.1,3.A.1 acetylated glucuronoxylan

575 rows × 2 columns

In [31]:
# View data for clustering
X_train
Out[31]:
737                               GH29,8.A.46,1.B.14,GH97
1478    2.A.21,GerE,FecR,1.B.14,TPR_1,PL35,GH88,GerE,F...
2366                              1.B.14,CBM32,GH2,2.A.88
170                 2.A.66,GT94,GT0,GT4,GT4,GT4,GT2,GH115
907     3.A.11,PL11_1,GH105,GH28,GH154,GH88,GH2,1.B.14...
                              ...                        
3148    PL8_3,PL38,GH88,1.B.14,GH29,GH31,GH2,PL38,GH10...
895                                        2.A.1,GH2,GH35
2223                                   GT2,2.A.66,GT4,GT2
2650    1.B.14,Sigma70_r2|Sigma70_r4_2,GH28,HTH_AraC,1...
113            GH2,GH53,8.A.46,1.B.14,GH147,HTH_AraC,PL13
Name: sig_gene_seq, Length: 1340, dtype: object

3. Convert words to vectors¶

In [32]:
# Countvectorizer
# To convert a collection of text documents to a vector of term/token counts
count_vectorizer = CountVectorizer(
    tokenizer=lambda x: str(x).replace("|", ",").split(","), lowercase=False
)
In [33]:
# Fit the count vectorizer (learn a vocabulary dictionary of all tokens in the raw documents)
count_vectorizer.fit(X_train)
/util/opt/anaconda/deployed-conda-envs/packages/biopython/envs/biopython-1.79-py39/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:489: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn("The parameter 'token_pattern' will not be used"
Out[33]:
CountVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x1489152db1f0>)
In [34]:
# Transform texts to sequences
count_vectorized_array = count_vectorizer.transform(X_train)
In [35]:
count_vectorized_array
# count_vectorized_array.toarray()
Out[35]:
<1340x415 sparse matrix of type '<class 'numpy.int64'>'
	with 7112 stored elements in Compressed Sparse Row format>
In [36]:
# vocabulary - dictionary containing the word (key) and vector (value)
vocabulary = count_vectorizer.vocabulary_
vocabulary
# View
first20vocabulary = {k: vocabulary[k] for k in sorted(vocabulary.keys())[:20]}
first20vocabulary
Out[36]:
{'1.A.1': 0,
 '1.A.13': 1,
 '1.A.22': 2,
 '1.A.23': 3,
 '1.A.26': 4,
 '1.A.28': 5,
 '1.A.30': 6,
 '1.A.33': 7,
 '1.A.34': 8,
 '1.A.35': 9,
 '1.A.62': 10,
 '1.A.78': 11,
 '1.B.14': 12,
 '1.B.17': 13,
 '1.B.18': 14,
 '1.B.44': 15,
 '1.B.5': 16,
 '1.B.6': 17,
 '1.B.9': 18,
 '1.C.105': 19}
In [37]:
# Inverse vocabulary
inv_map_vocab = {v: k for k, v in vocabulary.items()}
# View
first20_inv_vocabulary = {
    k: inv_map_vocab[k] for k in sorted(inv_map_vocab.keys())[:20]
}
first20_inv_vocabulary
Out[37]:
{0: '1.A.1',
 1: '1.A.13',
 2: '1.A.22',
 3: '1.A.23',
 4: '1.A.26',
 5: '1.A.28',
 6: '1.A.30',
 7: '1.A.33',
 8: '1.A.34',
 9: '1.A.35',
 10: '1.A.62',
 11: '1.A.78',
 12: '1.B.14',
 13: '1.B.17',
 14: '1.B.18',
 15: '1.B.44',
 16: '1.B.5',
 17: '1.B.6',
 18: '1.B.9',
 19: '1.C.105'}

4. CGC Clustering¶

In [38]:
# Affinity Propagation - a machine learning algorithm that can find data centers or clusters
# by sending messages between pairs of data points.
# does not need to know number of clusters
from sklearn.cluster import AffinityPropagation
In [39]:
# Fit the clustering algorithm on the unsupervised data
clustering = AffinityPropagation(random_state=5).fit(count_vectorized_array)
# random_state: pseudo-random number generator to control the starting state.
In [40]:
clustering.labels_
Out[40]:
array([ 58,  38,  11, ..., 143, 158,  40])
In [41]:
# For counting
# Counter: it is a collection where elements are stored as dictionary keys
# and their counts are stored as dictionary values.
from collections import Counter
In [42]:
# Count for number of clusters
cluster_freq_info = pd.DataFrame(Counter(clustering.labels_), index=[0]).T.reset_index()
# T: Transpose index and columns
# view
cluster_freq_info.shape
Out[42]:
(159, 2)
In [43]:
# Name columns and view
cluster_freq_info.columns = ["cluster_id", "number of samples"]
cluster_freq_info.head()
Out[43]:
cluster_id number of samples
0 58 33
1 38 14
2 11 28
3 62 20
4 69 5
In [44]:
# Select the top clusters for visualization
cluster_freq_info = cluster_freq_info.sort_values("number of samples", ascending=False)
In [45]:
cluster_freq_info
Out[45]:
cluster_id number of samples
50 104 59
13 1 36
0 58 33
48 70 33
17 88 32
... ... ...
7 0 1
27 3 1
81 29 1
83 30 1
158 150 1

159 rows × 2 columns

In [46]:
# We will create clusters to substrate mappings,
# convert strings to vector,
# pass the supervised strings through the count vectorizer

supervised_seqs_array = count_vectorizer.transform(X_supervised["sig_gene_seq"].values)
In [47]:
# supervised_seqs_array
supervised_seqs_array_dense = supervised_seqs_array.todense()
supervised_seqs_array_dense
Out[47]:
matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])
In [48]:
# Use the fit cluster object to predict clusters on the supervised set
supervised_set_clusters = clustering.predict(supervised_seqs_array_dense)
In [49]:
# Start to create the mapping
mapping_df = pd.DataFrame(X_supervised["low_level_substr"].values)
In [50]:
# Put the predicted clusters
mapping_df["cluster_id"] = supervised_set_clusters
In [51]:
# Sort and name the columns
mapping_df = mapping_df.sort_values("cluster_id")
mapping_df.columns = ["low_level_substr", "cluster_id"]
mapping_df
Out[51]:
low_level_substr cluster_id
162 xanthan 1
532 alpha-mannan 1
295 carrageenan 1
508 pectin 1
230 host glycan 1
... ... ...
25 rhamnogalacturonan 154
249 rhamnogalacturonan 154
487 xylan 155
175 sialoglycoconjugate 157
168 N-glycan 157

575 rows × 2 columns

In [52]:
# Aggregate substrate by using the cluster id
mapping_df = pd.DataFrame(
    mapping_df.groupby("cluster_id")["low_level_substr"].aggregate(pd.Series.mode)
).reset_index()
In [53]:
# Remove [] from low_level_substr column
catch = []
for seq in mapping_df["low_level_substr"].values:
    seq = str(seq).replace("]", "").replace("[", "")
    catch.append(seq)
In [54]:
mapping_df["low_level_substr"] = catch
mapping_df
Out[54]:
cluster_id low_level_substr
0 1 host glycan
1 2 mucin
2 5 capsule polysaccharide
3 7 xylan
4 8 rhamnogalacturonan
... ... ...
110 152 beta-glucan
111 153 mucin
112 154 rhamnogalacturonan
113 155 xylan
114 157 'N-glycan' 'sialoglycoconjugate'

115 rows × 2 columns

5. Visualization¶

In [55]:
# TSNE for visualization
# T-distributed Stochastic Neighbor Embedding - visualize high-dimensional data.
from sklearn.manifold import TSNE
In [56]:
# Covert sparse matrix
count_vectorized_array
Out[56]:
<1340x415 sparse matrix of type '<class 'numpy.int64'>'
	with 7112 stored elements in Compressed Sparse Row format>
In [57]:
# 3D plot
X_embedded = TSNE(
    n_components=3, init="pca"  # learning_rate='auto',init='random'
).fit_transform(count_vectorized_array.toarray())

# n_components: dimension of the embedded space.
# init: initialization of embedding.
# fit_transform: fit X into an embedded space and return that transformed output.
In [58]:
# View dataframe
X_embedded_df = pd.DataFrame(X_embedded)
X_embedded_df
Out[58]:
0 1 2
0 27.550228 1.689406 1.121049
1 29.111511 -17.498882 27.308899
2 12.939676 -5.533157 -23.785519
3 -83.159393 19.958351 5.421502
4 52.284691 -21.424486 -20.653650
... ... ... ...
1335 64.076935 3.201710 -38.197628
1336 -14.928308 25.385132 27.029896
1337 -67.314041 6.404743 18.647472
1338 -20.898037 -34.193462 -14.455879
1339 57.091476 -29.505133 21.557560

1340 rows × 3 columns

In [59]:
# Add cluster_id column to dataframe
X_embedded_df["cluster_id"] = clustering.labels_
X_embedded_df
Out[59]:
0 1 2 cluster_id
0 27.550228 1.689406 1.121049 58
1 29.111511 -17.498882 27.308899 38
2 12.939676 -5.533157 -23.785519 11
3 -83.159393 19.958351 5.421502 62
4 52.284691 -21.424486 -20.653650 69
... ... ... ... ...
1335 64.076935 3.201710 -38.197628 98
1336 -14.928308 25.385132 27.029896 88
1337 -67.314041 6.404743 18.647472 143
1338 -20.898037 -34.193462 -14.455879 158
1339 57.091476 -29.505133 21.557560 40

1340 rows × 4 columns

In [60]:
# Merge mapping_df with X_embedded_df by cluster_id
X_embedded_df = X_embedded_df.merge(mapping_df, how="left", on="cluster_id")
In [61]:
X_embedded_df
Out[61]:
0 1 2 cluster_id low_level_substr
0 27.550228 1.689406 1.121049 58 alpha-mannan
1 29.111511 -17.498882 27.308899 38 'glucose' 'homogalacturonan' 'host glycan' 'mu...
2 12.939676 -5.533157 -23.785519 11 host glycan
3 -83.159393 19.958351 5.421502 62 capsule polysaccharide
4 52.284691 -21.424486 -20.653650 69 NaN
... ... ... ... ... ...
1335 64.076935 3.201710 -38.197628 98 'N-glycan' 'host glycan'
1336 -14.928308 25.385132 27.029896 88 glycosaminoglycan
1337 -67.314041 6.404743 18.647472 143 capsule polysaccharide
1338 -20.898037 -34.193462 -14.455879 158 NaN
1339 57.091476 -29.505133 21.557560 40 'exopolysaccharide' 'porphyran'

1340 rows × 5 columns

In [62]:
# View top 10 substrates
X_embedded_df["low_level_substr"].value_counts().head(n=10)
Out[62]:
host glycan                  294
capsule polysaccharide       137
glycosaminoglycan             94
pectin                        71
alpha-mannan                  54
acetylated glucuronoxylan     48
homogalacturonan              46
xylan                         38
N-glycan                      34
beta-glucan                   31
Name: low_level_substr, dtype: int64
In [63]:
# Select top 10 substrates
top_10_classes = X_embedded_df["low_level_substr"].value_counts()[:10]
top_10_classes
Out[63]:
host glycan                  294
capsule polysaccharide       137
glycosaminoglycan             94
pectin                        71
alpha-mannan                  54
acetylated glucuronoxylan     48
homogalacturonan              46
xylan                         38
N-glycan                      34
beta-glucan                   31
Name: low_level_substr, dtype: int64
In [64]:
# Make top 10 substrates to list
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)
top_10_classes_list
Out[64]:
['host glycan',
 'capsule polysaccharide',
 'glycosaminoglycan',
 'pectin',
 'alpha-mannan',
 'acetylated glucuronoxylan',
 'homogalacturonan',
 'xylan',
 'N-glycan',
 'beta-glucan']
In [65]:
# Subset X_embedded_df dataframe by list
X_embedded_df_viz = X_embedded_df[
    X_embedded_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_df_viz
Out[65]:
0 1 2 cluster_id low_level_substr
0 27.550228 1.689406 1.121049 58 alpha-mannan
2 12.939676 -5.533157 -23.785519 11 host glycan
3 -83.159393 19.958351 5.421502 62 capsule polysaccharide
5 -40.710545 -0.388352 -24.416637 152 beta-glucan
6 2.397810 -11.162458 31.722109 17 host glycan
... ... ... ... ... ...
1327 -59.101582 -15.625198 -7.301012 103 xylan
1328 65.735535 20.191668 -23.182314 27 host glycan
1331 -12.281227 29.358931 13.767683 88 glycosaminoglycan
1336 -14.928308 25.385132 27.029896 88 glycosaminoglycan
1337 -67.314041 6.404743 18.647472 143 capsule polysaccharide

847 rows × 5 columns

In [66]:
# astype:change a pandas object to a specified dtype
X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
/tmp/ipykernel_3088779/3803832902.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
In [67]:
# plotly - an interactive, open-source, and browser-based graphing library
import plotly.express as px
In [68]:
fig = px.scatter_3d(X_embedded_df_viz, x=0, y=1, z=2, color="low_level_substr")
fig.write_html("cgc_clusters_3D_top10.html")

You can not see the plot in ternimal, it is OK. You can download the file to your own computer and open it in your browser.

In [69]:
# view plot
fig.show()
In [70]:
# view X_embedded_df dataframe
X_embedded_df.shape
Out[70]:
(1340, 5)
In [71]:
# Creat a new dataframe - summarize cgc sequences, cluster ids and substrates
final_df_cgc_cluster_substrate = X_embedded_df[["low_level_substr", "cluster_id"]]
In [72]:
final_df_cgc_cluster_substrate["sig_gene_seq"] = X_train.values
final_df_cgc_cluster_substrate["sig_gene_seq"]
/tmp/ipykernel_3088779/3070682039.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[72]:
0                                 GH29,8.A.46,1.B.14,GH97
1       2.A.21,GerE,FecR,1.B.14,TPR_1,PL35,GH88,GerE,F...
2                                 1.B.14,CBM32,GH2,2.A.88
3                   2.A.66,GT94,GT0,GT4,GT4,GT4,GT2,GH115
4       3.A.11,PL11_1,GH105,GH28,GH154,GH88,GH2,1.B.14...
                              ...                        
1335    PL8_3,PL38,GH88,1.B.14,GH29,GH31,GH2,PL38,GH10...
1336                                       2.A.1,GH2,GH35
1337                                   GT2,2.A.66,GT4,GT2
1338    1.B.14,Sigma70_r2|Sigma70_r4_2,GH28,HTH_AraC,1...
1339           GH2,GH53,8.A.46,1.B.14,GH147,HTH_AraC,PL13
Name: sig_gene_seq, Length: 1340, dtype: object
In [73]:
# Name the columns
final_df_cgc_cluster_substrate[["sig_gene_seq", "cluster_id", "low_level_substr"]]
Out[73]:
sig_gene_seq cluster_id low_level_substr
0 GH29,8.A.46,1.B.14,GH97 58 alpha-mannan
1 2.A.21,GerE,FecR,1.B.14,TPR_1,PL35,GH88,GerE,F... 38 'glucose' 'homogalacturonan' 'host glycan' 'mu...
2 1.B.14,CBM32,GH2,2.A.88 11 host glycan
3 2.A.66,GT94,GT0,GT4,GT4,GT4,GT2,GH115 62 capsule polysaccharide
4 3.A.11,PL11_1,GH105,GH28,GH154,GH88,GH2,1.B.14... 69 NaN
... ... ... ...
1335 PL8_3,PL38,GH88,1.B.14,GH29,GH31,GH2,PL38,GH10... 98 'N-glycan' 'host glycan'
1336 2.A.1,GH2,GH35 88 glycosaminoglycan
1337 GT2,2.A.66,GT4,GT2 143 capsule polysaccharide
1338 1.B.14,Sigma70_r2|Sigma70_r4_2,GH28,HTH_AraC,1... 158 NaN
1339 GH2,GH53,8.A.46,1.B.14,GH147,HTH_AraC,PL13 40 'exopolysaccharide' 'porphyran'

1340 rows × 3 columns

In [74]:
# View substrate by using cluster id
mapping_df[mapping_df["cluster_id"] == 0]
Out[74]:
cluster_id low_level_substr
In [75]:
# View number of sequences by using cluster id
cluster_freq_info[cluster_freq_info["cluster_id"] == 68]
Out[75]:
cluster_id number of samples
137 68 1
In [76]:
final_df_cgc_cluster_substrate["cluster_id"].value_counts()[:10]
Out[76]:
104    59
1      36
58     33
70     33
88     32
11     28
22     28
14     27
144    25
124    24
Name: cluster_id, dtype: int64
In [77]:
# Write out a csv file
final_df_cgc_cluster_substrate.to_csv("cgc_cluster_substrate.tsv", index=False, sep="\t")
In [78]:
# 2D plot
X_embedded_2D = TSNE(n_components=2, init="pca").fit_transform(
    count_vectorized_array.toarray()
)

X_embedded_2D_df = pd.DataFrame(X_embedded_2D)
X_embedded_2D_df["cluster_id"] = clustering.labels_
X_embedded_2D_df = X_embedded_2D_df.merge(mapping_df, how="left", on="cluster_id")
X_embedded_2D_df
Out[78]:
0 1 cluster_id low_level_substr
0 15.473319 7.805315 58 alpha-mannan
1 9.921034 -25.687372 38 'glucose' 'homogalacturonan' 'host glycan' 'mu...
2 15.637207 -15.620399 11 host glycan
3 -65.089294 4.293285 62 capsule polysaccharide
4 38.521568 -17.407051 69 NaN
... ... ... ... ...
1335 33.397781 -14.693295 98 'N-glycan' 'host glycan'
1336 -11.899568 15.598096 88 glycosaminoglycan
1337 -57.790684 13.014769 143 capsule polysaccharide
1338 -7.240588 -38.635487 158 NaN
1339 34.799389 4.621681 40 'exopolysaccharide' 'porphyran'

1340 rows × 4 columns

In [79]:
top_10_classes = X_embedded_2D_df["low_level_substr"].value_counts()[:10]
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)

X_embedded_2D_df_viz = X_embedded_df[
    X_embedded_2D_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_2D_df_viz["low_level_substr"] = X_embedded_2D_df_viz[
    "low_level_substr"
].astype(str)

fig = px.scatter(X_embedded_2D_df_viz, x=0, y=1, color="low_level_substr")
fig.write_html("cgc_clusters_2D_top10.html")
fig.show()
/tmp/ipykernel_3088779/1996784998.py:7: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy